knitr::opts_chunk$set(
echo = TRUE,
message = FALSE,
warning = FALSE
)
library(tidyverse) #used for data wrangling and viz
library(here) #simplifies file paths
library(rsample) #used to split data
library(janitor) #used to clean data names
library(corrplot) #for correlation viz
library(VIM) #for missing data viz
library(mice) #for imputing missing data
library(ggfortify) #for PCA viz
library(fastDummies) #for dummy coding
library(caret) #for cross validation
library(class) #for knn
library(gbm) #for boosted trees modeling
library(randomForest) #for random forest modeling
library(maptools)
library(RColorBrewer)
library(classInt)
library(ggplot2)
library(ggrepel)
library(mapproj)
library(viridis)
library(pander)
library(knitr)
#turns scientific notation off
options(scipen = 100)
#some of our workflow includes randomness. here we set a seed so our workflow can be reproduced without worrying about random variation
set.seed(123) The purpose of this project is to use machine learning techniques to predict which wildfires will grow to a catastrophic size. Our most accurate model will then be used to assess how global climate change may influence a wildfires predicted size.
fire situation in North America, climate change, etc
What will it be used for?
This dataset is available on Kaggle. It is a subset of larger fires
Important attributes include
fire_size - the size of the fire in acres
stat_cause_descr - the cause of the fire
vegetation - code corresponding to vegetation type
temp_cont - temperature (Celsius) when the fire was contained
A complete codebook is available in the project file
#read in dataset
us_wildfire <- read_csv(here("archive", "FW_Veg_Rem_Combined.csv"))
#Following to the National Geographic Education
us_wildfire$region = "NorthEast"
us_wildfire[which(us_wildfire$state %in% c("AK", "WA", "OR", "CA", "NV", "ID", "MT", "WY", "UT", "CO")),"region"] = "West"
us_wildfire[which(us_wildfire$state %in% c("AZ", "NM", "TX", "OK")), "region"] = "SouthWest"
us_wildfire[which(us_wildfire$state %in% c("ND", "SD", "NE", "KS", "MN", "IA", "MO", "WI", "IL", "MI", "IN", "OH")), "region"] = "MidWest"
us_wildfire[which(us_wildfire$state %in% c("AR", "LA", "MS", "AL", "GA", "FL", "SC", "NC", "TN", "KY", "VA", "WV")), "region"] = "SouthEast"
us_wildfire$year_diff = us_wildfire$disc_pre_year - min(us_wildfire$disc_pre_year)Our methods involved cleaning the data, conducting exploratory analysis, imputing missing data, and using Principle Components Analysis to reduce the number of features.
To clean the data we first converted all column names to lower snake case using the clean_names() function. Then we selected only the columns were were interested in using, thereby removing all the meaningless columns.
us_wildfire_clean <- us_wildfire %>%
dplyr::select(fire_name:year_diff) %>% #select columns we want to use
clean_names() #convert to lowercase snakeWe are interested in using weather to predict fire size, so we filter out observations where the weather data was not associated with the fire observation.
There are some weather observations for which every weather field is 0. This seems unlikely, especially for temperature and humidity observations. So we will replace them with NAs. Since precipitation is frequently 0, we will not replace all 0 with NAs in the precipitation column.
Instead, we will follow the pattern of NAs seen in other the other weather columns. We can see that there is a clear pattern in the missing weather data. when temp_pre_30 is missing, so is wind_pre_30 and humidity_pre_30. We will assume this extends to prec_pre_30 and replace that 0 with a NA.
#filter out observations that do not have a weather file
us_wildfire_clean <- us_wildfire_clean %>%
filter(weather_file != "File Not Found")
#replace 0 with NA
us_wildfire_clean <- us_wildfire_clean %>%
mutate_at(vars(temp_pre_30:hum_cont), ~na_if(., 0.0000000))
#when temp, wind, and humidity are missing, we suspect precipitation is as well.
#we use a case when statement to replace these zeros with NAs
us_wildfire_clean <- us_wildfire_clean %>%
mutate(prec_pre_30 = case_when(is.na(temp_pre_30) & is.na(hum_pre_30) & is.na(wind_pre_30) ~ NA_real_,
TRUE ~ prec_pre_30)) %>%
mutate(prec_pre_15 = case_when(is.na(temp_pre_15) & is.na(hum_pre_15) & is.na(wind_pre_15) ~ NA_real_,
TRUE ~ prec_pre_15)) %>%
mutate(prec_pre_7 = case_when(is.na(temp_pre_7) & is.na(hum_pre_7) & is.na(wind_pre_7) ~ NA_real_,
TRUE ~ prec_pre_7)) %>%
mutate(prec_cont = case_when(is.na(temp_cont) & is.na(hum_cont) & is.na(wind_cont) ~ NA_real_,
TRUE ~ prec_cont))Information about vegetation was stored in a numeric form, where each number represented a class of vegetation. The key for the vegetation codes was in the provided metadata. For ease of interpretation, we replaced the numeric code with a short description the vegetation type.
According the metadata for this data set, the vegetation was created by interpolating most likely vegetation based on latitude and longitude. The most common vegetation type is listed as “Polar Desert/Rock/Ice” and this seems very unlikely. Although we keep the vegetation feature in the data, we have doubts about its accuracy.
#Here we label vegetation according to the provided codebook
us_wildfire_clean <- us_wildfire_clean %>%
mutate(vegetation_classed = case_when(
vegetation == 12 ~ "Open Shrubland",
vegetation == 15 ~ "Polar Desert/Rock/Ice",
vegetation == 16 ~ "Secondary Tropical Evergreen Broadleaf Forest",
vegetation == 4 ~ "Temperate Evergreen Needleleaf Forest TmpENF",
vegetation == 9 ~ "C3 Grassland/Steppe",
vegetation == 14 ~ "Desert"
))Our final data cleaning action was to clean up date columns. Since there are redundant date columns, we chose to keep only month and year as variables. We also reclassified months into seasons, so there would be fewer factor levels or dummy coding involved when it came time to prepare the data for modeling.
#keep month and year as variables
us_wildfire_clean <- us_wildfire_clean %>%
dplyr::select(-disc_clean_date,
-disc_date_pre,
-cont_clean_date,
-disc_pre_month,
-disc_date_final,
-cont_date_final,
-putout_time)
#we also reclassify months into seasons, to reduce factor levels
us_wildfire_clean <- us_wildfire_clean %>%
mutate(season = case_when(discovery_month %in% c("Apr", "May", "Jun") ~ "Spring",
discovery_month %in% c("Jul", "Aug", "Sep") ~ "Summer",
discovery_month %in% c("Oct", "Nov", "Dec") ~ "Fall",
discovery_month %in% c("Jan", "Feb", "Mar") ~ "Winter")) %>%
select(-discovery_month)Our next step was to explore the data. We created several figures to investigate fire observations, as well as the pattern of missing data and the correlation between variables.
First we investigated fire size through time. This figure shows that the total acrage burned per year has been increasing almost linearly since 1990. This is a clue that Year might be a useful feature in our models.
#summarise acres per year burned
acres_per_year <- us_wildfire_clean %>%
group_by(disc_pre_year) %>%
summarise(acres_burned = sum(fire_size))
#fire size (finalized graph)
ggplot(data = acres_per_year) +
geom_point(aes(x = disc_pre_year,
y = acres_burned,
size = acres_burned,
color = acres_burned)) +
scale_color_continuous(high = "firebrick", low = "goldenrod1") +
labs(x = "Year", y = "Total Acres Burned",
title = "Total acres burned per year from 1990 to 2015") +
theme_minimal() +
theme(legend.position = "none")We also looked at the most common causes of fires. We found that most fires were started by debris burning, followed by arson. Yikes! We were also surprised that a children were frequently listed as the cause of wildfires.
#most common causes of fire
fire_causes <- us_wildfire_clean %>%
group_by(stat_cause_descr) %>%
count()
#cause (finalized)
ggplot(data = fire_causes, aes(y = reorder(stat_cause_descr, n), x = n)) +
geom_col(aes(fill = n)) +
scale_fill_gradient(high = "firebrick", low = "goldenrod1") +
labs(x = "Number of Fires",
y = "Cause",
tite = "Number of fires per listed starting cause") +
theme_minimal() +
theme(legend.position = "none")us_wildfire_clean$class_fac = factor(us_wildfire_clean$fire_size_class, levels = c("A", "B", "C", "D", "E", "F", "G"))
us <- map_data("world", 'usa')
state <- map_data("state")
state$region2 = "NorthEast"
state[which(state$region %in% c("alaska", "washington", "oregon", "california", "nevada", "idaho", "montana", "utah", "wyoming", "colorado")), "region2"] = "West"
state[which(state$region %in% c("arizona", "new mexico", "oklahoma", "texas")), "region2"] = "SouthWest"
state[which(state$region %in% c("north dakota", "south dakota", "nebraska", "kansas", "minnesota", "iowa", "missouri", "wisconsin", "illinois", "indiana", "michigan", "ohio")), "region2"] = "MidWest"
state[which(state$region %in% c("arkansas", "louisiana", "mississippi", "alabama", "florida", "georgia", "south carolina", "north carolina", "tennessee", "kentucky", "virginia", "west virginia")), "region2"] = "SouthEast"
#state$region = as.factor(state$region)
ggplot(data=state, aes(x=long, y=lat, group = region)) +
geom_polygon(aes(fill=region2)) +
ggtitle("US Region")+
guides(fill=guide_legend(title="Region"))+
coord_map(projection = "sinusoidal", xlim=c(-120, -75), ylim = c(25, 50))ggplot() +
geom_polygon(data=state, aes(x=long, y=lat, group=group), color = "white", fill = "grey") +
geom_point(data = us_wildfire_clean, aes(x=longitude, y = latitude, color = class_fac)) +
scale_color_brewer(palette = "YlOrRd")+
ggtitle("US Wildfire Distribution")+
guides(color=guide_legend(title="Wild Fire Scale"))+
coord_map(projection = "sinusoidal", xlim=c(-120, -75), ylim = c(25, 50))ggplot() +
geom_polygon(data=state, aes(x=long, y=lat, group=group), color = "white", fill = "grey") +
geom_point(data = us_wildfire_clean[which(us_wildfire_clean$wstation_byear < 1970),], aes(x=longitude, y = latitude, color = class_fac)) +
scale_color_brewer(palette = "YlOrRd")+
ggtitle("US Wildfire Distribution before 1970")+
guides(color=guide_legend(title="Wild Fire Scale"))+
coord_map(projection = "sinusoidal", xlim=c(-120, -75), ylim = c(25, 50))ggplot() +
geom_polygon(data=state, aes(x=long, y=lat, group=group), color = "white", fill = "grey") +
geom_point(data = us_wildfire_clean[which(us_wildfire_clean$wstation_byear >= 1970 & us_wildfire_clean$wstation_byear < 2000),], aes(x=longitude, y = latitude, color = class_fac)) +
scale_color_brewer(palette = "YlOrRd")+
ggtitle("US Wildfire Distribution 1970-2000")+
guides(color=guide_legend(title="Wild Fire Scale"))+
coord_map(projection = "sinusoidal", xlim=c(-120, -75), ylim = c(25, 50))ggplot() +
geom_polygon(data=state, aes(x=long, y=lat, group=group), color = "white", fill = "grey") +
geom_point(data = us_wildfire_clean[which(us_wildfire_clean$wstation_byear >= 200),], aes(x=longitude, y = latitude, color = class_fac)) +
scale_color_brewer(palette = "YlOrRd")+
ggtitle("US Wildfire Distribution after 2000")+
guides(color=guide_legend(title="Wild Fire Scale"))+
coord_map(projection = "sinusoidal", xlim=c(-120, -75), ylim = c(25, 50))ggplot() +
geom_density(data= us_wildfire_clean[which(us_wildfire_clean$wstation_byear <= 1970 & us_wildfire_clean$fire_size > 100),], aes(x = fire_size, y=..density..),
alpha=.3,
colour="dodgerblue", fill="dodgerblue") +
geom_density(data= us_wildfire_clean[which(us_wildfire_clean$wstation_byear >= 1970 & us_wildfire_clean$fire_size > 100 & us_wildfire_clean$wstation_byear < 2000),], aes(x = fire_size, y=..density..),
alpha=.3,
colour="yellow3", fill="yellow3") +
geom_density(data= us_wildfire_clean[which(us_wildfire_clean$wstation_byear >= 2000 & us_wildfire_clean$fire_size > 100),], aes(x = fire_size, y=..density..),
alpha=.3,
colour="firebrick3", fill="firebrick3") +
xlim(10000, 100000) +
ggtitle("Wildfire Severeity")As part of our exploratory analysis, we we looked at the pattern of missing data. This figure shows missing data in red. You can see that most missing data is concentrated in the weather columns, and the name of the fire is also often missing.
#missing data plot
aggr_plot <- aggr(us_wildfire_clean,
col = c('yellow','red'),
numbers = TRUE,
sortVars = TRUE,
labels = names(us_wildfire_clean),
cex.axis = .7,
gap = 2,
ylab = c("Histogram of missing data","Pattern"))##
## Variables sorted by number of missings:
## Variable Count
## fire_name 0.50282019
## hum_cont 0.42786638
## wind_cont 0.41558884
## temp_cont 0.41002139
## prec_cont 0.41002139
## vegetation_classed 0.17480307
## hum_pre_7 0.14329476
## hum_pre_15 0.12783234
## wind_pre_7 0.12250802
## temp_pre_7 0.11173782
## prec_pre_7 0.11173782
## wind_pre_15 0.10663231
## temp_pre_15 0.09571623
## prec_pre_15 0.09571623
## hum_pre_30 0.09413595
## wind_pre_30 0.07308179
## temp_pre_30 0.06216571
## prec_pre_30 0.06216571
## fire_size 0.00000000
## fire_size_class 0.00000000
## stat_cause_descr 0.00000000
## latitude 0.00000000
## longitude 0.00000000
## state 0.00000000
## disc_pre_year 0.00000000
## wstation_usaf 0.00000000
## dstation_m 0.00000000
## wstation_wban 0.00000000
## wstation_byear 0.00000000
## wstation_eyear 0.00000000
## vegetation 0.00000000
## fire_mag 0.00000000
## weather_file 0.00000000
## remoteness 0.00000000
## region 0.00000000
## year_diff 0.00000000
## season 0.00000000
## class_fac 0.00000000
We hypothesized that weather variables were correlated. To test this out, we created a correlation matrix.
This figure shows that there is a strong correlation with each set of variables. I.e. the 7 day average temperature is correlated with the 30 day average temperature. This is not surprising!
#create a dataframe of weather
weather <- us_wildfire_clean %>%
dplyr::select(temp_pre_30:prec_cont)
#create a correlation matrix (omitting NAs)
cor_matrix <- cor(weather, use = "complete.obs")
#create a visualization
corrplot(cor_matrix, type = "upper", order = "hclust",
tl.col = "black", tl.srt = 45)After exploring our data, we were ready to prepare it for modeling. We first split the data into test and training data, then do some data manipulations separately on the test and training data.
Before we split the data, we got rid of even more variables. Some we used in exploratory data analysis but decided not to use in modeling, and some were just not useful. Then we split our data into 80% training and 20% testing data. Because fire size is heavily skewed towards smaller fires, we used stratified sampling.
There are 32903 observations in the training set and 8229 observations in the test set.
#first we make a data frame containing only variables we want to use in our models
fire_modeling_df <- us_wildfire_clean %>%
dplyr::select(-fire_name, #remove fire name
-fire_size_class, #remove fire size class
-class_fac, #which is also a size class
-state, #because we have regions
-disc_pre_year, #because we have year_diff to represent year
-wstation_usaf, #remove weather station name
-wstation_wban, #remove this (not described in codebook)
-wstation_byear, #remove station year installed
-wstation_eyear, #remove station year ended data recording
-weather_file, #remove name of weather file
-dstation_m, #remove distance of weather station to fire
-vegetation #remove vegetation because we already have it classed
)
#define split parameters
us_wildfire_split <- fire_modeling_df %>%
initial_split(prop = 0.8, strata = "fire_size")
#write split data to data frames
fire_train <- training(us_wildfire_split)
fire_test <- testing(us_wildfire_split)
#set up folds in training data for cross validation
train_folds <- vfold_cv(fire_train, v = 10, repeats = 5)Next we impute (or replace) all the missing data in our weather columns. We use the predictive mean method of imputation, which works by predicting the value of the missing variable using regression, then randomly choosing a similar value from a pool of candidates that are found in the data. We use the default values of m = 5 and maxit = 5 to reduce computing time, but increasing the number of multiple imputations using the m argument and increasing the number of iterations through the maxit argument could increase accuracy.
#select weather cols
weather_train <- fire_train %>%
select(temp_pre_30:prec_cont)
weather_test <- fire_test %>%
select(temp_pre_30:prec_cont)
#select cols not in weather
notweather_train <- fire_train %>%
select(-colnames(weather_train))
notweather_test <- fire_test %>%
select(-colnames(weather_test))
#imputation of weather data
weather_train_imputed <- mice(weather_train, m=5, maxit=5, meth='pmm', seed=500) %>%
complete()
weather_test_imputed <- mice(weather_test, m=5, maxit=5, meth='pmm', seed=500) %>%
complete()#Do PCA on both test and train data separately
weather_train_PCA <- weather_train_imputed %>%
scale(center = T, scale = T) %>% #first scale and center the data
prcomp(rank. = 4) #do PCA
#weather_train_PCA$x
#Do PCA on both test and train data separately
weather_test_PCA <- weather_test_imputed %>%
scale(center = T, scale = T) %>% #first scale and center the data
prcomp(rank. = 4)
#Make a PCA bi-plot
autoplot(weather_train_PCA,
data = weather_train,
loadings = TRUE,
loadings.label = TRUE,
loadings.label.hjust = 1.2) +
theme_classic() +
labs(caption = "Principle Component Analysis Bi-plot of Weather Training Data")#put data back together
#bind imputed weather data to rest of rows
fire_train_complete <- cbind(notweather_train, weather_train_PCA$x) %>%
na.omit()
fire_test_complete <- cbind(notweather_test, weather_test_PCA$x) %>%
na.omit()First of all, we applied multiple linear regression models on the dataset. The performance of the multiple linear regression will present why we need to apply machine learning approaches. First of all, I started with the whole variables including principal components. And then, less relevant variables were deleted from to find the optimal multiple linear regression model.
lm_whole = lm(fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness + vegetation_classed + stat_cause_descr, data = fire_train_complete)
summary(lm_whole) %>% pander| Estimate | Std. Error | t value | |
|---|---|---|---|
| (Intercept) | 3429 | 386.1 | 8.88 |
| PC1 | -738.6 | 39.65 | -18.63 |
| PC2 | -180.7 | 39.4 | -4.587 |
| PC3 | 268.1 | 48.03 | 5.582 |
| PC4 | -466.9 | 54.57 | -8.557 |
| year_diff | 57.18 | 12.22 | 4.68 |
| remoteness | -9138 | 631.2 | -14.48 |
| vegetation_classedDesert | -979.2 | 683.4 | -1.433 |
| vegetation_classedOpen Shrubland | -1474 | 308.7 | -4.775 |
| vegetation_classedPolar Desert/Rock/Ice | -616.3 | 307.2 | -2.006 |
| vegetation_classedSecondary Tropical Evergreen Broadleaf Forest | -1692 | 314.2 | -5.384 |
| vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF | -188.8 | 612.9 | -0.3081 |
| stat_cause_descrCampfire | 2165 | 523 | 4.139 |
| stat_cause_descrChildren | -584.4 | 515.4 | -1.134 |
| stat_cause_descrDebris Burning | -238 | 234.4 | -1.015 |
| stat_cause_descrEquipment Use | 626.9 | 334.7 | 1.873 |
| stat_cause_descrFireworks | -1309 | 1219 | -1.074 |
| stat_cause_descrLightning | 5165 | 297.3 | 17.37 |
| stat_cause_descrMiscellaneous | 334.6 | 271.3 | 1.233 |
| stat_cause_descrMissing/Undefined | 512.9 | 327.2 | 1.567 |
| stat_cause_descrPowerline | 1091 | 761.7 | 1.432 |
| stat_cause_descrRailroad | -157.8 | 585.1 | -0.2697 |
| stat_cause_descrSmoking | -575.3 | 547.7 | -1.05 |
| stat_cause_descrStructure | -527.5 | 1974 | -0.2672 |
| Pr(>|t|) | |
|---|---|
| (Intercept) | 0.0000000000000000007079 |
| PC1 | 0.00000000000000000000000000000000000000000000000000000000000000000000000000006054 |
| PC2 | 0.000004517 |
| PC3 | 0.00000002406 |
| PC4 | 0.00000000000000001215 |
| year_diff | 0.000002886 |
| remoteness | 0.00000000000000000000000000000000000000000000002534 |
| vegetation_classedDesert | 0.1519 |
| vegetation_classedOpen Shrubland | 0.000001807 |
| vegetation_classedPolar Desert/Rock/Ice | 0.04483 |
| vegetation_classedSecondary Tropical Evergreen Broadleaf Forest | 0.00000007335 |
| vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF | 0.758 |
| stat_cause_descrCampfire | 0.00003497 |
| stat_cause_descrChildren | 0.2568 |
| stat_cause_descrDebris Burning | 0.3101 |
| stat_cause_descrEquipment Use | 0.06102 |
| stat_cause_descrFireworks | 0.2829 |
| stat_cause_descrLightning | 0.0000000000000000000000000000000000000000000000000000000000000000003096 |
| stat_cause_descrMiscellaneous | 0.2174 |
| stat_cause_descrMissing/Undefined | 0.117 |
| stat_cause_descrPowerline | 0.152 |
| stat_cause_descrRailroad | 0.7874 |
| stat_cause_descrSmoking | 0.2935 |
| stat_cause_descrStructure | 0.7894 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 27229 | 12420 | 0.05937 | 0.05858 |
lm_whole2 <- update(lm_whole, . ~ . - stat_cause_descr)
summary(lm_whole2) %>% pander| Estimate | Std. Error | t value | |
|---|---|---|---|
| (Intercept) | 5079 | 346.2 | 14.67 |
| PC1 | -1041 | 36.71 | -28.37 |
| PC2 | -54.51 | 39.16 | -1.392 |
| PC3 | 198.2 | 48.17 | 4.114 |
| PC4 | -504.1 | 54.68 | -9.22 |
| year_diff | 61.56 | 12.08 | 5.098 |
| remoteness | -8807 | 633 | -13.91 |
| vegetation_classedDesert | -1449 | 688.1 | -2.106 |
| vegetation_classedOpen Shrubland | -2705 | 303.6 | -8.91 |
| vegetation_classedPolar Desert/Rock/Ice | -1128 | 308 | -3.662 |
| vegetation_classedSecondary Tropical Evergreen Broadleaf Forest | -3001 | 308.4 | -9.731 |
| vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF | -250 | 617.5 | -0.4048 |
| Pr(>|t|) | |
|---|---|
| (Intercept) | 0.000000000000000000000000000000000000000000000001574 |
| PC1 | 1.814e-174 |
| PC2 | 0.164 |
| PC3 | 0.00003896 |
| PC4 | 0.00000000000000000003176 |
| year_diff | 0.0000003451 |
| remoteness | 0.00000000000000000000000000000000000000000007579 |
| vegetation_classedDesert | 0.03525 |
| vegetation_classedOpen Shrubland | 0.0000000000000000005391 |
| vegetation_classedPolar Desert/Rock/Ice | 0.000251 |
| vegetation_classedSecondary Tropical Evergreen Broadleaf Forest | 0.0000000000000000000002423 |
| vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF | 0.6856 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 27229 | 12523 | 0.04317 | 0.04279 |
When we compare the whole region’s regression results, it shows that the updated model is superior than the others.
lm_sw = lm(fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness + vegetation_classed + stat_cause_descr, data = fire_train_complete[which(fire_train_complete$region == "SouthWest"),])
summary(lm_sw) %>% pander| Estimate | Std. Error | t value | |
|---|---|---|---|
| (Intercept) | 1392 | 1360 | 1.024 |
| PC1 | -600.3 | 91.77 | -6.542 |
| PC2 | -632.1 | 91.56 | -6.903 |
| PC3 | 175.1 | 214.5 | 0.8164 |
| PC4 | -599.4 | 200.7 | -2.987 |
| year_diff | 106.7 | 32.31 | 3.301 |
| remoteness | -8083 | 3743 | -2.159 |
| vegetation_classedPolar Desert/Rock/Ice | -93.29 | 620.6 | -0.1503 |
| vegetation_classedSecondary Tropical Evergreen Broadleaf Forest | -456.3 | 424.3 | -1.075 |
| stat_cause_descrCampfire | 7782 | 1496 | 5.201 |
| stat_cause_descrChildren | -1297 | 1710 | -0.7583 |
| stat_cause_descrDebris Burning | -184.5 | 667 | -0.2766 |
| stat_cause_descrEquipment Use | -634.4 | 805.9 | -0.7872 |
| stat_cause_descrFireworks | -1586 | 3343 | -0.4744 |
| stat_cause_descrLightning | 2256 | 780 | 2.892 |
| stat_cause_descrMiscellaneous | 498.2 | 686.9 | 0.7254 |
| stat_cause_descrMissing/Undefined | 2786 | 913.5 | 3.049 |
| stat_cause_descrPowerline | 934.5 | 1229 | 0.7603 |
| stat_cause_descrRailroad | -827.6 | 2254 | -0.3671 |
| stat_cause_descrSmoking | -802.6 | 1358 | -0.5908 |
| stat_cause_descrStructure | 432.9 | 8078 | 0.0536 |
| Pr(>|t|) | |
|---|---|
| (Intercept) | 0.3059 |
| PC1 | 0.00000000006565 |
| PC2 | 0.000000000005581 |
| PC3 | 0.4143 |
| PC4 | 0.002832 |
| year_diff | 0.0009695 |
| remoteness | 0.03085 |
| vegetation_classedPolar Desert/Rock/Ice | 0.8805 |
| vegetation_classedSecondary Tropical Evergreen Broadleaf Forest | 0.2823 |
| stat_cause_descrCampfire | 0.0000002044 |
| stat_cause_descrChildren | 0.4483 |
| stat_cause_descrDebris Burning | 0.7821 |
| stat_cause_descrEquipment Use | 0.4312 |
| stat_cause_descrFireworks | 0.6352 |
| stat_cause_descrLightning | 0.003842 |
| stat_cause_descrMiscellaneous | 0.4682 |
| stat_cause_descrMissing/Undefined | 0.002302 |
| stat_cause_descrPowerline | 0.4471 |
| stat_cause_descrRailroad | 0.7135 |
| stat_cause_descrSmoking | 0.5547 |
| stat_cause_descrStructure | 0.9573 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 6326 | 13948 | 0.03803 | 0.03498 |
lm_sw2 <- update(lm_sw, . ~ . - stat_cause_descr - PC2 - remoteness)
summary(lm_sw2) %>% pander| Estimate | Std. Error | t value | |
|---|---|---|---|
| (Intercept) | -389.4 | 569.6 | -0.6837 |
| PC1 | -715.9 | 81.23 | -8.813 |
| PC3 | 450.8 | 210.5 | 2.141 |
| PC4 | -446.3 | 197.7 | -2.257 |
| year_diff | 86.15 | 31.08 | 2.772 |
| vegetation_classedPolar Desert/Rock/Ice | 1261 | 583.3 | 2.163 |
| vegetation_classedSecondary Tropical Evergreen Broadleaf Forest | 176.6 | 410.3 | 0.4305 |
| Pr(>|t|) | |
|---|---|
| (Intercept) | 0.4942 |
| PC1 | 0.00000000000000000156 |
| PC3 | 0.03227 |
| PC4 | 0.02403 |
| year_diff | 0.005586 |
| vegetation_classedPolar Desert/Rock/Ice | 0.0306 |
| vegetation_classedSecondary Tropical Evergreen Broadleaf Forest | 0.6669 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 6326 | 14053 | 0.02124 | 0.02031 |
lm_se = lm(fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness + vegetation_classed + stat_cause_descr, data = fire_train_complete[which(fire_train_complete$region == "SouthEast"),])
summary(lm_se) %>% pander| Estimate | Std. Error | t value | |
|---|---|---|---|
| (Intercept) | -2518 | 2241 | -1.123 |
| PC1 | -6.769 | 28.15 | -0.2404 |
| PC2 | -74.23 | 20.92 | -3.549 |
| PC3 | 37.7 | 20.13 | 1.873 |
| PC4 | 9.194 | 27.98 | 0.3286 |
| year_diff | 2.743 | 5.846 | 0.4692 |
| remoteness | 19527 | 468.7 | 41.66 |
| vegetation_classedOpen Shrubland | -987 | 2240 | -0.4406 |
| vegetation_classedPolar Desert/Rock/Ice | -746.6 | 2241 | -0.3331 |
| vegetation_classedSecondary Tropical Evergreen Broadleaf Forest | -847 | 2240 | -0.3781 |
| stat_cause_descrCampfire | 18.38 | 238 | 0.07723 |
| stat_cause_descrChildren | 640.6 | 218.4 | 2.933 |
| stat_cause_descrDebris Burning | 227.5 | 91.92 | 2.475 |
| stat_cause_descrEquipment Use | 430.6 | 161.2 | 2.672 |
| stat_cause_descrFireworks | 251.5 | 1041 | 0.2417 |
| stat_cause_descrLightning | 1371 | 156.4 | 8.768 |
| stat_cause_descrMiscellaneous | 258.2 | 134.8 | 1.915 |
| stat_cause_descrMissing/Undefined | 242.6 | 149.5 | 1.623 |
| stat_cause_descrPowerline | 297.4 | 918.3 | 0.3239 |
| stat_cause_descrRailroad | 572.4 | 219.7 | 2.605 |
| stat_cause_descrSmoking | 354.6 | 248.7 | 1.426 |
| stat_cause_descrStructure | 178 | 894 | 0.1991 |
| Pr(>|t|) | |
|---|---|
| (Intercept) | 0.2613 |
| PC1 | 0.81 |
| PC2 | 0.0003882 |
| PC3 | 0.06109 |
| PC4 | 0.7425 |
| year_diff | 0.6389 |
| remoteness | 0 |
| vegetation_classedOpen Shrubland | 0.6595 |
| vegetation_classedPolar Desert/Rock/Ice | 0.7391 |
| vegetation_classedSecondary Tropical Evergreen Broadleaf Forest | 0.7053 |
| stat_cause_descrCampfire | 0.9384 |
| stat_cause_descrChildren | 0.003358 |
| stat_cause_descrDebris Burning | 0.01335 |
| stat_cause_descrEquipment Use | 0.007557 |
| stat_cause_descrFireworks | 0.809 |
| stat_cause_descrLightning | 0.000000000000000002049 |
| stat_cause_descrMiscellaneous | 0.05553 |
| stat_cause_descrMissing/Undefined | 0.1047 |
| stat_cause_descrPowerline | 0.746 |
| stat_cause_descrRailroad | 0.00919 |
| stat_cause_descrSmoking | 0.154 |
| stat_cause_descrStructure | 0.8422 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 12420 | 3878 | 0.1364 | 0.1349 |
lm_se2 <- update(lm_se, . ~ . - stat_cause_descr - PC1 - PC4- year_diff - vegetation_classed)
summary(lm_se2) %>% pander| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | -3163 | 87.46 | -36.16 | 2.37e-272 |
| PC2 | -44.3 | 19.84 | -2.233 | 0.02557 |
| PC3 | 25.06 | 18.28 | 1.371 | 0.1703 |
| remoteness | 19710 | 458.4 | 43 | 0 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 12420 | 3890 | 0.1298 | 0.1296 |
lm_mw = lm(fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness + vegetation_classed + stat_cause_descr, data = fire_train_complete[which(fire_train_complete$region == "MidWest"),])
summary(lm_mw) %>% pander| Estimate | Std. Error | t value | |
|---|---|---|---|
| (Intercept) | 4277 | 551.2 | 7.76 |
| PC1 | -380.8 | 60.1 | -6.336 |
| PC2 | -124.7 | 49.19 | -2.534 |
| PC3 | 123 | 60.17 | 2.044 |
| PC4 | -126.3 | 74.51 | -1.695 |
| year_diff | 7.774 | 13.94 | 0.5579 |
| remoteness | -15485 | 1866 | -8.299 |
| vegetation_classedDesert | -274.9 | 545.5 | -0.504 |
| vegetation_classedPolar Desert/Rock/Ice | 218.3 | 183.1 | 1.192 |
| vegetation_classedSecondary Tropical Evergreen Broadleaf Forest | 1166 | 875.2 | 1.332 |
| vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF | 2718 | 507.8 | 5.353 |
| stat_cause_descrCampfire | 184.8 | 507.8 | 0.3639 |
| stat_cause_descrChildren | -217 | 420.9 | -0.5156 |
| stat_cause_descrDebris Burning | -326 | 260.3 | -1.252 |
| stat_cause_descrEquipment Use | -247.7 | 341.9 | -0.7243 |
| stat_cause_descrFireworks | -21.4 | 692.9 | -0.03089 |
| stat_cause_descrLightning | 2341 | 381.2 | 6.142 |
| stat_cause_descrMiscellaneous | -141.6 | 288.3 | -0.4913 |
| stat_cause_descrMissing/Undefined | 283.6 | 411.9 | 0.6885 |
| stat_cause_descrPowerline | 1427 | 794.8 | 1.795 |
| stat_cause_descrRailroad | -558.4 | 600.9 | -0.9292 |
| stat_cause_descrSmoking | -615.1 | 600.6 | -1.024 |
| stat_cause_descrStructure | -492.2 | 1376 | -0.3578 |
| Pr(>|t|) | |
|---|---|
| (Intercept) | 0.00000000000001241 |
| PC1 | 0.0000000002809 |
| PC2 | 0.01133 |
| PC3 | 0.04102 |
| PC4 | 0.09017 |
| year_diff | 0.577 |
| remoteness | 0.0000000000000001719 |
| vegetation_classedDesert | 0.6143 |
| vegetation_classedPolar Desert/Rock/Ice | 0.2335 |
| vegetation_classedSecondary Tropical Evergreen Broadleaf Forest | 0.1831 |
| vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF | 0.00000009469 |
| stat_cause_descrCampfire | 0.716 |
| stat_cause_descrChildren | 0.6062 |
| stat_cause_descrDebris Burning | 0.2105 |
| stat_cause_descrEquipment Use | 0.4689 |
| stat_cause_descrFireworks | 0.9754 |
| stat_cause_descrLightning | 0.0000000009517 |
| stat_cause_descrMiscellaneous | 0.6232 |
| stat_cause_descrMissing/Undefined | 0.4912 |
| stat_cause_descrPowerline | 0.07273 |
| stat_cause_descrRailroad | 0.3529 |
| stat_cause_descrSmoking | 0.3059 |
| stat_cause_descrStructure | 0.7205 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 2442 | 4058 | 0.09372 | 0.08548 |
lm_mw2 <- update(lm_mw, . ~ . - stat_cause_descr - PC3 - year_diff - PC2)
summary(lm_mw2) %>% pander| Estimate | Std. Error | t value | |
|---|---|---|---|
| (Intercept) | 4417 | 480.3 | 9.197 |
| PC1 | -494.7 | 51.56 | -9.595 |
| PC4 | -85.95 | 71.69 | -1.199 |
| remoteness | -13760 | 1811 | -7.599 |
| vegetation_classedDesert | -338.4 | 545.9 | -0.6198 |
| vegetation_classedPolar Desert/Rock/Ice | 205.3 | 181.1 | 1.134 |
| vegetation_classedSecondary Tropical Evergreen Broadleaf Forest | 841.9 | 870.7 | 0.9669 |
| vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF | 3085 | 506.5 | 6.092 |
| Pr(>|t|) | |
|---|---|
| (Intercept) | 0.00000000000000000007683 |
| PC1 | 0.000000000000000000001992 |
| PC4 | 0.2307 |
| remoteness | 0.00000000000004223 |
| vegetation_classedDesert | 0.5354 |
| vegetation_classedPolar Desert/Rock/Ice | 0.2571 |
| vegetation_classedSecondary Tropical Evergreen Broadleaf Forest | 0.3337 |
| vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF | 0.000000001292 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 2442 | 4105 | 0.06685 | 0.06416 |
lm_w = lm(fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness + vegetation_classed + stat_cause_descr, data = fire_train_complete[which(fire_train_complete$region == "West"),])
summary(lm_w) %>% pander| Estimate | Std. Error | t value | |
|---|---|---|---|
| (Intercept) | 18377 | 2008 | 9.151 |
| PC1 | -342.2 | 196.8 | -1.739 |
| PC2 | 502.3 | 234.5 | 2.142 |
| PC3 | 574.3 | 527.3 | 1.089 |
| PC4 | 515 | 427.7 | 1.204 |
| year_diff | 191.2 | 54.37 | 3.516 |
| remoteness | -44987 | 2168 | -20.75 |
| vegetation_classedDesert | -1175 | 1601 | -0.7339 |
| vegetation_classedOpen Shrubland | 2404 | 2674 | 0.8988 |
| vegetation_classedPolar Desert/Rock/Ice | -222 | 904.5 | -0.2454 |
| vegetation_classedSecondary Tropical Evergreen Broadleaf Forest | 804.4 | 1377 | 0.584 |
| vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF | -1072 | 1706 | -0.6282 |
| stat_cause_descrCampfire | 2517 | 2612 | 0.9637 |
| stat_cause_descrChildren | -313.2 | 2934 | -0.1068 |
| stat_cause_descrDebris Burning | -204.8 | 2003 | -0.1022 |
| stat_cause_descrEquipment Use | 1384 | 1877 | 0.7376 |
| stat_cause_descrFireworks | -3180 | 4276 | -0.7437 |
| stat_cause_descrLightning | 2019 | 1635 | 1.235 |
| stat_cause_descrMiscellaneous | -678.8 | 1769 | -0.3838 |
| stat_cause_descrMissing/Undefined | -2706 | 1971 | -1.373 |
| stat_cause_descrPowerline | -1045 | 3399 | -0.3075 |
| stat_cause_descrRailroad | -1699 | 3869 | -0.4393 |
| stat_cause_descrSmoking | -154 | 3331 | -0.04624 |
| stat_cause_descrStructure | -3989 | 8487 | -0.47 |
| Pr(>|t|) | |
|---|---|
| (Intercept) | 0.00000000000000000008529 |
| PC1 | 0.08213 |
| PC2 | 0.03224 |
| PC3 | 0.2761 |
| PC4 | 0.2286 |
| year_diff | 0.0004421 |
| remoteness | 0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002947 |
| vegetation_classedDesert | 0.463 |
| vegetation_classedOpen Shrubland | 0.3688 |
| vegetation_classedPolar Desert/Rock/Ice | 0.8062 |
| vegetation_classedSecondary Tropical Evergreen Broadleaf Forest | 0.5592 |
| vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF | 0.5299 |
| stat_cause_descrCampfire | 0.3353 |
| stat_cause_descrChildren | 0.915 |
| stat_cause_descrDebris Burning | 0.9186 |
| stat_cause_descrEquipment Use | 0.4608 |
| stat_cause_descrFireworks | 0.4571 |
| stat_cause_descrLightning | 0.2168 |
| stat_cause_descrMiscellaneous | 0.7012 |
| stat_cause_descrMissing/Undefined | 0.1699 |
| stat_cause_descrPowerline | 0.7585 |
| stat_cause_descrRailroad | 0.6605 |
| stat_cause_descrSmoking | 0.9631 |
| stat_cause_descrStructure | 0.6384 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 4352 | 23598 | 0.1268 | 0.1222 |
lm_w2 <- update(lm_w, . ~ . - stat_cause_descr - PC3 - PC4 - vegetation_classed)
summary(lm_w2) %>% pander| Estimate | Std. Error | t value | |
|---|---|---|---|
| (Intercept) | 19539 | 1151 | 16.98 |
| PC1 | -305.3 | 162.3 | -1.882 |
| PC2 | 587.4 | 209.3 | 2.806 |
| year_diff | 187.4 | 53.68 | 3.491 |
| remoteness | -46518 | 2044 | -22.76 |
| Pr(>|t|) | |
|---|---|
| (Intercept) | 0.00000000000000000000000000000000000000000000000000000000000001116 |
| PC1 | 0.05992 |
| PC2 | 0.005032 |
| year_diff | 0.0004867 |
| remoteness | 1.85e-108 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 4352 | 23602 | 0.1227 | 0.1219 |
lm_ne = lm(fire_size ~ PC1 + PC2 + PC3 + PC4 + year_diff + remoteness + vegetation_classed + stat_cause_descr, data = fire_train_complete[which(fire_train_complete$region == "NorthEast"),])
summary(lm_ne) %>% pander| Estimate | Std. Error | t value | |
|---|---|---|---|
| (Intercept) | -56.05 | 46.87 | -1.196 |
| PC1 | -6.488 | 9.053 | -0.7167 |
| PC2 | -12.24 | 6.177 | -1.982 |
| PC3 | -4.673 | 6.896 | -0.6776 |
| PC4 | 8.486 | 8.777 | 0.9668 |
| year_diff | 0.4043 | 2.001 | 0.2021 |
| remoteness | 1131 | 68.98 | 16.39 |
| vegetation_classedDesert | -21.49 | 66.98 | -0.3208 |
| vegetation_classedOpen Shrubland | -60.52 | 79.02 | -0.766 |
| vegetation_classedPolar Desert/Rock/Ice | 28.23 | 26.09 | 1.082 |
| vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF | 6.235 | 35.88 | 0.1738 |
| stat_cause_descrCampfire | -22.07 | 64.14 | -0.3441 |
| stat_cause_descrChildren | -26.46 | 82.96 | -0.3189 |
| stat_cause_descrDebris Burning | -36.62 | 39.7 | -0.9224 |
| stat_cause_descrEquipment Use | -30.52 | 67.42 | -0.4526 |
| stat_cause_descrFireworks | 9.788 | 411.3 | 0.0238 |
| stat_cause_descrLightning | -27.37 | 62.79 | -0.4359 |
| stat_cause_descrMiscellaneous | -37.79 | 33.96 | -1.113 |
| stat_cause_descrMissing/Undefined | 31.73 | 65.38 | 0.4853 |
| stat_cause_descrPowerline | -34.26 | 122.1 | -0.2805 |
| stat_cause_descrRailroad | -24.84 | 121.9 | -0.2038 |
| stat_cause_descrSmoking | -39.33 | 56.33 | -0.6982 |
| stat_cause_descrStructure | -31.6 | 411.3 | -0.07682 |
| Pr(>|t|) | |
|---|---|
| (Intercept) | 0.2319 |
| PC1 | 0.4737 |
| PC2 | 0.04762 |
| PC3 | 0.4981 |
| PC4 | 0.3338 |
| year_diff | 0.8399 |
| remoteness | 0.00000000000000000000000000000000000000000000000000000004354 |
| vegetation_classedDesert | 0.7484 |
| vegetation_classedOpen Shrubland | 0.4438 |
| vegetation_classedPolar Desert/Rock/Ice | 0.2794 |
| vegetation_classedTemperate Evergreen Needleleaf Forest TmpENF | 0.8621 |
| stat_cause_descrCampfire | 0.7308 |
| stat_cause_descrChildren | 0.7498 |
| stat_cause_descrDebris Burning | 0.3564 |
| stat_cause_descrEquipment Use | 0.6509 |
| stat_cause_descrFireworks | 0.981 |
| stat_cause_descrLightning | 0.663 |
| stat_cause_descrMiscellaneous | 0.266 |
| stat_cause_descrMissing/Undefined | 0.6276 |
| stat_cause_descrPowerline | 0.7791 |
| stat_cause_descrRailroad | 0.8385 |
| stat_cause_descrSmoking | 0.4852 |
| stat_cause_descrStructure | 0.9388 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 1689 | 409.1 | 0.1588 | 0.1477 |
lm_ne2 <- update(lm_ne, . ~ . - stat_cause_descr - vegetation_classed - year_diff - PC4 - PC3 - PC2)
summary(lm_ne2) %>% pander| Estimate | Std. Error | t value | |
|---|---|---|---|
| (Intercept) | -44.59 | 13.83 | -3.225 |
| PC1 | -10.2 | 7.279 | -1.401 |
| remoteness | 1103 | 64.76 | 17.03 |
| Pr(>|t|) | |
|---|---|
| (Intercept) | 0.001286 |
| PC1 | 0.1614 |
| remoteness | 0.000000000000000000000000000000000000000000000000000000000004196 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 1689 | 407.9 | 0.1539 | 0.1529 |
The whole region’s RMSE is 10,325.41. We use it for comparing the model’s performance. The model performance will be checked with RMSE.
whole_pred = predict(lm_whole2, newdata = fire_test_complete)
test_MSE <- mean((fire_test_complete$fire_size - whole_pred)^2)
#print test RMSE
whole_test_RMSE = sqrt(test_MSE)sw_pred = predict(lm_sw2, newdata = fire_test_complete[which(fire_test_complete$region == "SouthWest"),])
sw_test_MSE <- mean((fire_test_complete$fire_size[which(fire_test_complete$region == "SouthWest")] - sw_pred)^2)
#print test RMSE
sw_test_RMSE = sqrt(sw_test_MSE)
se_pred = predict(lm_se2, newdata = fire_test_complete[which(fire_test_complete$region == "SouthEast"),])
se_test_MSE <- mean((fire_test_complete$fire_size[which(fire_test_complete$region == "SouthEast")] - se_pred)^2)
#print test RMSE
se_test_RMSE = sqrt(se_test_MSE)
mw_pred = predict(lm_mw2, newdata = fire_test_complete[which(fire_test_complete$region == "MidWest"),])
mw_test_MSE <- mean((fire_test_complete$fire_size[which(fire_test_complete$region == "MidWest")] - mw_pred)^2)
#print test RMSE
mw_test_RMSE = sqrt(mw_test_MSE)
w_pred = predict(lm_w2, newdata = fire_test_complete[which(fire_test_complete$region == "West"),])
w_test_MSE <- mean((fire_test_complete$fire_size[which(fire_test_complete$region == "West")] - w_pred)^2)
#print test RMSE
w_test_RMSE = sqrt(w_test_MSE)
ne_pred = predict(lm_w2, newdata = fire_test_complete[which(fire_test_complete$region == "NorthEast"),])
ne_test_MSE <- mean((fire_test_complete$fire_size[which(fire_test_complete$region == "NorthEast")] - ne_pred)^2)
#print test RMSE
ne_test_RMSE = sqrt(ne_test_MSE)
table_lm = data.frame(Region = c("Whole US", "North East", "Mid West", "West", "South West", "South East"),
RMSE = c(whole_test_RMSE, ne_test_RMSE, mw_test_RMSE, w_test_RMSE, sw_test_RMSE, se_test_RMSE),
NumberofObservation = c(length(whole_pred),length(ne_pred),length(mw_pred),length(w_pred),length(sw_pred),length(se_pred)))
kable(table_lm, caption = "Linear Model's RMSE")| Region | RMSE | NumberofObservation |
|---|---|---|
| Whole US | 10228.90 | 6713 |
| North East | 20688.09 | 435 |
| Mid West | 5640.77 | 630 |
| West | 15576.66 | 1058 |
| South West | 11565.08 | 1581 |
| South East | 3281.71 | 3009 |
KNN (short for K Nearest Neighbor) is a non-linear algorithm that works for both classification and regression problems. The basic premise behind the model is that it predicts the dependent variable based on how similar they are to other observations where the dependent variable is known.
KNN works by calculating euclidean distance between observations, so all inputs have to be numeric. Therefore, we have to do some pre-model data changes to our training and test sets. For both test data and training data we dummy code categorical variables, then we center and scale the data. As before, we assume that the training and test data are completely separated, so we process the two data sets separately.
#first we dummy code categorical variables (what about ordinal?)
knn_ready_train <- dummy_cols(fire_train_complete, select_columns =
c("stat_cause_descr", "region", "vegetation_classed", "season")) %>%
#then we get rid of non-dummy coded variables
select(-stat_cause_descr, - region, - vegetation_classed, -season) %>%
#then convert back to a data frame (as output for `dummy_cols` is a matrix)
as.data.frame()
#then we center and scale the data, except our outcome
knn_ready_train[,-1] <- scale(knn_ready_train[,-1], center = T, scale = T)
#of course our next step is to do the same to the test data
knn_ready_test <- dummy_cols(fire_test_complete,
select_columns = c("stat_cause_descr", "region", "vegetation_classed", "season")) %>%
select(-stat_cause_descr, - region, - vegetation_classed, -season) %>%
as.data.frame()
knn_ready_test[,-1] <- scale(knn_ready_test[,-1], center = T, scale = T)Next we split up the training and test data into a data frame that has only independent variables and a data frame that has only dependent variables (in this case fire_size). We do this for both the test and the training data.
#YTrain is the true values for fire size on the training set
YTrain = knn_ready_train$fire_size
#XTrain is the design matrix for training data
XTrain = knn_ready_train %>%
select(-fire_size)
#YTest is the true value for fire_size on the test set
YTest = knn_ready_test$fire_size
#Xtest is the design matrix for test data
XTest = knn_ready_test %>%
select(-fire_size)Then we use Leave One Out Cross Validation to determine the best number of neighbors to consider. A low number of neighbors considered results in a highly flexible model, while a higher number results in a less flexible model. For this process we built a function which we enter a starting value of K, an ending value of K, and the sampling interval. The result is a data frame of MSE values for each value of K that we test. This process is computationally intensive so we automatically saved results to a csv.
#make a function that saves KNN LOOCV results for different values of K
knn_loocv <- function(startk, endk, interval)
{
#set up possible number of nearest neighbors to be considered
allK = seq(from = startk, to = endk, by = interval)
#create a vector of the same length to save the MSE in later
k_mse = rep(NA, length(allK))
#for each number in allK, use LOOCV to find a validation error
for (i in allK){
#loop through different number of neighbors
#predict on the left-out validation set
pred.Yval = knn.cv(train = XTrain, cl = YTrain, k = i)
#find the mse for each value of k
k_mse[i] = mean((as.numeric(pred.Yval) - YTrain)^2)
}
#save as a data frame and filter out NAs (caused by skipping values of k if interval is larger than 1)
k_mse <- as.data.frame(k_mse) %>%
filter(!is.na(k_mse))
#bind with k value
knn_loocv_results <- cbind(as.data.frame(allK), k_mse)
#save MSE as CSV (because the cross validation process takes a long time)
write_csv(knn_loocv_results, paste0("model_results/knn/knn_mse_k", startk,"_k", endk, "_by", interval, ".csv"))
}
#we tried several different sets of k
knn_loocv(startk = 1, endk = 20, interval = 1)
knn_loocv(startk = 1, endk = 500, interval = 20)Next, we go through our results for all the values of K that we tested. We find that the MSE increases as K increases.
#read in all the stored k values
knn_mse_k1_k500_by20 <- read_csv(here("model_results", "knn", "knn_mse_k1_k500_by20.csv"))
knn_mse_k1_k20_by1 <- read_csv(here("model_results", "knn", "knn_mse_k1_k20_by1.csv"))
#plot MSE for values of k
plot(knn_mse_k1_k500_by20$allK, knn_mse_k1_k500_by20$k_mse, type = "l", xlab = "k", ylab = "MSE", main = "MSE for K 1 through 500 at intervals of 20")When we look at K 1-20 we get the lowest MSE at around 3 before it starts increasing.
plot(knn_mse_k1_k20_by1$allK, knn_mse_k1_k20_by1$k_mse, type = "l", xlab = "k", ylab = "MSE", main = "MSE for K 1 through 20 at intervals of 1")Just to confirm, we will print out the best number of neighbors.
#best number of neighbors
numneighbor = max(knn_mse_k1_k20_by1$allK[knn_mse_k1_k20_by1$k_mse == min(knn_mse_k1_k20_by1$k_mse)])
#print best number of neighbors
numneighbor## [1] 2
Now that we’ve found the best number of nearest neighbors to consider, we try out KNN on our test data! Below is the test MSE and the test RMSE.
#fit KNN model with best neighbor value to test data
pred.YTest = knn(train = XTrain, test = XTest, cl = YTrain, k = numneighbor)
#predict KNN
test_MSE <- mean((as.numeric(pred.YTest) - YTest)^2)
#print test MSE
test_MSE## [1] 83373364
#print test RMSE
sqrt(test_MSE)## [1] 9130.902
Over all, this model performs okay.
#train boosted tree model
fire_size_boost = gbm(fire_size~.,
data = fire_train,
n.trees = 500,
interaction.depth = 4
)
#the model summary creates a pretty visualization
summary(fire_size_boost)
#calculate training error
#predict values using training data
predictions_train_boost <- predict(fire_size_boost, data = fire_train)
#calculate rmse for training data
RMSE(predictions_train_boost, fire_train$fire_size)
#calculate test error
#predict values using test data
predictions_test_boost <- predict(fire_size_boost, data = fire_test)
#calculate rmse for training data
RMSE(predictions_test_boost, fire_test$fire_size)#train model
fire_size_rf = randomForest(fire_size ~ ., #writing the formula
data = fire_train_complete, #specifying training data to be used
mtry = 9, #setting number of variables to randomly sample per each split
ntree= 500, #setting number of trees
mode = "regression", #specifying regression
na.action = na.omit, #specifying what to do with NAs
importance = TRUE #specifying importance of variables should be assessed
)
#plot error
plot(fire_size_rf)
#plot variable importance
varImpPlot(fire_size_rf,
sort = T,
main = "Variable Importance for fire size random forest model",
n.var = 5)
#calculate training error
#predict values using training data
predictions_train_rf <- predict(fire_size_rf, data = fire_train)
#calculate rmse for training data
RMSE(predictions_train_rf, fire_train$fire_size)
#calculate test error
#predict values using test data
predictions_test_rf <- predict(fire_size_rf, data = fire_test)
#calculate rmse for training data
RMSE(predictions_test_rf, fire_test$fire_size)what type of modeling will we do? we must try 4 types
predictions without climate change
How do predictions change with 2.5C increase in temp?